import plotly.plotly as py
import cufflinks as cf
import pandas as pd
import numpy as np
cf.set_config_file(offline=True, world_readable=False, theme='ggplot')
# Load the three sample traffic datasets at different granularities:
# one row per day, per minute, and per second respectively.
df_d = pd.read_csv("days_data.csv")
df_m = pd.read_csv("minute_data.csv")
df_s = pd.read_csv("second_data.csv")
# Each row is a single day
# Preview the first rows of the daily data and plot its "count" column
# (iplot comes from cufflinks, configured for offline mode above).
df_d.head(5)
df_d.iplot(y="count")
# Each row is a single minute
# Preview the first rows of the minute data and plot its "count" column.
df_m.head(5)
df_m.iplot(y="count")
# Each row is a single second
# Preview the first rows of the per-second data and plot its "count" column.
df_s.head(5)
df_s.iplot(y="count")
import datetime
import random
from itertools import cycle
import random
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Keep getting data from our df_m (minute) data in a circular way
# This allows us to generate data and keep repeating until we run out of days in our dataset
def get_circular_data(values=None):
    """Return an endless iterator cycling over *values*.

    Parameters
    ----------
    values : iterable, optional
        Sequence of counts to cycle over.  Defaults to the "count"
        column of the module-level minute dataset ``df_m``, preserving
        the original zero-argument behavior.

    Returns
    -------
    itertools.cycle
        Iterator that repeats the sequence forever.
    """
    if values is None:
        # Default kept from the original hard-coded implementation.
        values = df_m["count"].values
    return cycle(values)
def generate_data(start, end, step, my_func):
    """Generate timestamped rows from *start* (inclusive) to *end* (exclusive).

    Parameters
    ----------
    start, end : datetime.datetime
        Time range to cover; one row is produced for every *step* while
        the current time is strictly before *end*.
    step : datetime.timedelta
        Spacing between consecutive rows.
    my_func : iterator
        Value source; ``next()`` is called once per row (e.g. the
        cycling iterator returned by ``get_circular_data``).

    Returns
    -------
    list[dict]
        Dicts with "timestamp" (``YYYY-mm-dd HH:MM:SS`` string) and "value".
    """
    # Note: the original body computed an unused epoch timestamp here
    # (leftover notebook debugging); it has been removed.
    result = []
    current = start  # don't rebind the caller-visible parameter
    while current < end:
        result.append({
            "timestamp": current.strftime('%Y-%m-%d %H:%M:%S'),
            "value": next(my_func),
        })
        current += step
    return result
# Generate some data from Feb 1st 2019 to Feb 5th 2019
start = datetime.datetime(2019, 2, 1)
end = datetime.datetime(2019, 2, 5, 23, 59, 59)
step = datetime.timedelta(minutes=1) # By minutes (because we're using our minute dataset)
# Build the base DataFrame: one row per minute, values cycled from df_m.
df = pd.DataFrame(generate_data(start,end,step,get_circular_data()))
df.iplot(y="value")
df.head()
# Latency is loosely related to traffic volume
# Although good systems don't see an increase in latency when the traffic increases
def latency(value):
    """Simulate a latency reading (ms) for a traffic *value*.

    Starts from a 500 ms baseline and applies random multiplicative
    noise: heavy traffic inflates latency, light traffic deflates it.
    Non-deterministic — draws from ``random.uniform``.
    """
    result = 500  # 500 ms baseline (the original comment wrongly said 100 ms)
    # NOTE: thresholds are cumulative — a value > 11000 passes both of
    # the first two checks and receives both multipliers.
    if value > 9000:
        result = result * (1.01 + random.uniform(0, 1))
    if value > 11000:
        result = result * (1.09 + random.uniform(0, 1))
    if value < 5000:
        result = result * (0.7 + random.uniform(0, 1))
    return result
# We don't want to see many database errors, so let's simulate rare occurences
def db_errors(value, host_num):
    """Simulate a rare database error sample for one host.

    Returns 1 when the host-specific random event fires, otherwise NaN
    (so non-error samples stay invisible in plots).  *value* is unused
    but kept for API symmetry with the other column generators.

    Bug fix: the per-host conditional expressions were evaluated but
    never returned, so every host silently fell through to the generic
    ~1% rate below.  They are now returned as intended.
    """
    if host_num == 1:
        # Host 1 is flaky: errors on roughly 22% of samples.
        return 1 if random.randint(0, 100000) > 77777 else np.nan
    if host_num == 2:
        # Host 2 is healthy: ~2 in 100001 samples.
        return 1 if random.randint(0, 100000) > 99998 else np.nan
    if host_num == 3:
        # Host 3 is near-perfect: ~1 in a million.
        return 1 if random.randint(0, 1000000) > 999999 else np.nan
    # Any other host: generic ~1% error rate.
    return 1 if random.randint(0, 100) > 99 else np.nan
# Derive the synthetic metric columns from the base traffic "value" column.
df["http_500"] = df["value"]*.05 # Server Error
df["http_404"] = df["value"]*.1 # Page not found
df["http_200"] = df["value"]*5 # OK
df["login_success"] = df["value"]*.6
df["latency"] = df["value"].apply(lambda x: latency(x))
# Rare, host-specific database errors (mostly NaN, so plots stay sparse).
df["db_errors_host01"] = df["value"].apply(lambda x: db_errors(x,1))
df["db_errors_host02"] = df["value"].apply(lambda x: db_errors(x,2))
df["db_errors_host03"] = df["value"].apply(lambda x: db_errors(x,3))
df.head()
df.iplot(x="timestamp", y=["http_500", "http_404"])
df.iplot(x="timestamp", y="latency")
# We should see fewer logins per HTTP 200 since 200 OKs happen a lot
df.iplot(x="timestamp", y=["login_success", "http_200"])
df.index = pd.to_datetime(df["timestamp"])
# Down-sample into 60-minute buckets.  The original comment said
# "seconds", but "60T" means 60 *minutes*; "60min" is the modern,
# non-deprecated pandas alias.  Only the numeric error columns are
# selected — including the string "timestamp" column in .sum() breaks
# reset_index() (duplicate column name) on recent pandas; the timestamp
# is restored from the DatetimeIndex instead.
resampled = df[["db_errors_host01", "db_errors_host02", "db_errors_host03"]].resample("60min").sum().reset_index()
resampled.iplot(kind="scatter", mode="markers", x="timestamp")